This post serves as an introduction into an exploration of the Diabetes Epidemic in North Carolina. Through a series of post the project will evovle to explore the data avialable and provide possible solutions to the problem. This idea is based off a report written as my Masters Capstone. This post will answer the following questions
- What is the overall trend of diabetes in North Carolina.
# Attach these packages so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
library(magrittr) # enables piping : %>%
library(dplyr) # data wrangling
library(ggplot2) # graphs
library(tidyr) # data tidying
library(maps)
library(mapdata)
library(sf)
library(gganimate)
#set ggplot theme
ggplot2::theme_set(theme_bw())
#Put code in here. It doesn't call a chunk in the codebehind file.
The data for this exploration comes from several different sources:
The Diabetes data was taken from the US Diabetes Surveillance System; www.cdc.gov/diabetes/data; Division of Diabetes Translation - Centers for Disease Control and Prevention. The data was downloaded by year, and compiled into one set.
The list of rural counties is compiled from The US Census Bureau, it includes all North Carolina Counties that are at least 90% rural, more on the topic can be found here Rural America
# load the data, and have all column names in lowercase
nc_diabetes_data_raw <- readr::read_rds("./data-public/derived/nc-diabetes-data.rds") %>%
rename_all(tolower)
us_diabetes_data_raw <- readr::read_csv("data-public/raw/us_diabetes_totals.csv",
skip = 2)
rural_counties <- readr::read_csv("./data-public/metadata/rural-counties.csv")
county_centers_raw <- readxl::read_xlsx("./data-public/raw/nc_county_centers.xlsx", col_names = c("county", "lat","long"))
The Diabetes data comes quite tidy from the CDC, the script to combine each year can be found here The only tweaks done here are to combine the rural countines column, and the map data for creating maps.
county_centers <- county_centers_raw %>%
mutate_all(~stringr::str_replace_all(.,
c("\\°" = ""
,"\\+" = ""
,"\\–" = "-"
)
)
) %>%
mutate_at(c("lat","long"),as.numeric) %>%
mutate_at("county", tolower)
us_diabetes_data <- us_diabetes_data_raw %>%
filter(Year >= 2006) %>%
select( "Year","Total - Percentage") %>%
rename(year = Year , us_pct = `Total - Percentage`)
#join us totals
nc_diabetes_data <- nc_diabetes_data_raw %>%
mutate(
rural = county %in% rural_counties$rural_counties
) %>%
mutate_at("county",tolower) %>%
left_join(us_diabetes_data)
When examining North Carolina as a whole we can see that NC has been trending much higher than the United States as a whole. We see that in 2016 there was a large spike in diagnosied cases, unfortunally this is the last year of data available to see if this upward trend contiunes.
nc_diabetes_data %>%
group_by(year) %>%
summarise(
pct = mean(percentage)
,us_pct = mean(us_pct)
) %>%
pivot_longer(
cols = c("pct", "us_pct")
,names_to = "metric"
,values_to = "values"
) %>%
mutate(
metric = factor(metric
,levels = c("pct","us_pct")
,labels = c("NC", "National"))
) %>%
ggplot(aes(x = year, y = values, color = metric)) +
geom_line() +
geom_point(shape = 21, size = 3) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
scale_color_brewer(palette = "Dark2") +
labs(
x = NULL
,y = NULL
,color = NULL
,title = "Percent of Adults (20+) with Diagnosed Diabetes"
)
When examing the further breakdown of Urban Vs Rural we see that while all of North Carolina has a higher percentage of diagnosied cases than the US average, Rural North Carolina has a higher percetage of cases vs Urban NC.
d <- nc_diabetes_data %>%
select(-us_pct) %>%
mutate(
pct_rural = if_else(rural == TRUE, percentage, NULL)
,pct_urban = if_else(rural == FALSE, percentage, NULL)
) %>%
select(-countyfips,-percentage) %>%
group_by(year) %>%
summarise(
pct_rural = mean(pct_rural,na.rm = TRUE)
,pct_urban = mean(pct_urban,na.rm = TRUE)
) %>% left_join(us_diabetes_data) %>%
pivot_longer(
cols = c("us_pct", "pct_rural","pct_urban")
,names_to = "metric"
,values_to = "value"
,values_drop_na = TRUE
) %>%
mutate(
metric = factor(metric,
levels = c("pct_rural","pct_urban","us_pct")
,labels = c("Rural","Urban","US")
)
)
d %>% ggplot(aes(x = year, y = value, color = metric)) +
geom_line() +
geom_point(shape = 21, size = 3) +
# geom_smooth(method = "lm",se = FALSE) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
scale_color_brewer(palette = "Dark2") +
labs(
x = NULL
,y = NULL
,color = NULL
,title = "Percent of Adults (20+) with Diagnosed Diabetes \nDisplaying Rural vs Urban"
)
# ggpmisc::stat_poly_eq(formula = y ~ + x
# ,aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~"))
# ,parse = TRUE
# )
# 2006 Map Graph
counties <- st_as_sf(map("county",region = "north carolina", plot = FALSE,fill = TRUE)) %>%
mutate_at("ID", ~stringr::str_remove(.,"north carolina,")) %>%
left_join(nc_diabetes_data, by = c("ID" = "county"))
county_centers <- st_as_sf(county_centers, coords = c("long","lat")
,remove = FALSE, agr = "constant", crs = 4326)
county_centers <- county_centers %>%
left_join(nc_diabetes_data) %>%
mutate(
rural = if_else(rural,"R","U")
)
county_centers_2006 <- county_centers %>% filter(year == 2006)
county_centers_2016 <- county_centers %>% filter(year == 2016)
counties %>%
filter(year == 2006) %>%
ggplot() +
geom_sf(aes(fill = rural)) +
geom_sf(data = county_centers_2006
,aes(size = percentage)
,shape = 21
,fill = "#0571b0"
,color = "black"
,alpha = 0.8) +
scale_size(range = c(1,10)) +
scale_fill_viridis_d(alpha = 0.5, direction = -1) +
guides(
fill = guide_legend(title = "Rural")
,size = guide_legend(title = "Percentage")
) +
labs(
title = "Diagnosied Diabetes by County 2006"
)
counties %>%
filter(year == 2006) %>%
ggplot() +
geom_sf(aes(fill = percentage)) +
scale_fill_viridis_c(alpha = 0.6, direction = -1) +
geom_sf_text(aes(label = rural), data = county_centers_2006, color = "#666666") +
labs(
title = "Diagnosied Diabetes by County 2006"
,x = NULL
,y = NULL
,fill = "Percentage"
)
#2016 Map
counties %>%
filter(year == 2016) %>%
ggplot() +
geom_sf(aes(fill = rural)) +
geom_sf(data = county_centers_2016
,aes(size = percentage)
,shape = 21
,fill = "#0571b0"
,color = "black"
,alpha = 0.8) +
scale_size(range = c(1,10)) +
scale_fill_viridis_d(alpha = 0.5, direction = -1) +
guides(
fill = guide_legend(title = "Rural")
,size = guide_legend(title = "Percentage")
) +
labs(
title = "Diagnosied Diabetes by County 2016"
)
counties %>%
filter(year == 2016) %>%
mutate(
percentage = if_else(percentage <25,percentage, NULL)
) %>%
ggplot() +
geom_sf(aes(fill = percentage)) +
scale_fill_viridis_c(alpha = 0.6
,direction = -1
) +
geom_sf_text(aes(label = rural), data = county_centers_2016, color = "#666666") +
labs(
title = "Diagnosied Diabetes by County 2016"
,x = NULL
,y = NULL
,fill = "Percentage"
,caption = "Note : Jones County = 27.1%"
)
g <- counties %>%
ggplot() +
geom_sf(aes(fill = percentage)) +
scale_fill_viridis_c(alpha = 0.6
,direction = -1
) +
transition_manual(year)
g <- animate(g,end_pause = 10)
anim_save("./analysis/blogposts/basic-exploration/figure_rmd/animate_1.gif")
For the sake of documentation and reproducibility, the current report was rendered in the following environment. Click the line below to expand.
Environment
- Session info -------------------------------------------------------------------------------------------------------
setting value
version R version 3.6.2 (2019-12-12)
os Windows 10 x64
system x86_64, mingw32
ui RTerm
language (EN)
collate English_United States.1252
ctype English_United States.1252
tz America/New_York
date 2020-05-14
- Packages -----------------------------------------------------------------------------------------------------------
package * version date lib source
assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.1)
backports 1.1.5 2019-10-02 [1] CRAN (R 3.6.1)
callr 3.4.1 2020-01-24 [1] CRAN (R 3.6.2)
cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.6.1)
class 7.3-15 2019-01-01 [2] CRAN (R 3.6.2)
classInt 0.4-3 2020-04-07 [1] CRAN (R 3.6.3)
cli 2.0.1 2020-01-08 [1] CRAN (R 3.6.2)
colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.1)
crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.1)
DBI 1.1.0 2019-12-15 [1] CRAN (R 3.6.2)
desc 1.2.0 2018-05-01 [1] CRAN (R 3.6.2)
devtools 2.2.1 2019-09-24 [1] CRAN (R 3.6.2)
digest 0.6.21 2019-09-20 [1] CRAN (R 3.6.1)
dplyr * 0.8.3 2019-07-04 [1] CRAN (R 3.6.1)
e1071 1.7-3 2019-11-26 [1] CRAN (R 3.6.3)
ellipsis 0.3.0 2019-09-20 [1] CRAN (R 3.6.1)
evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.1)
fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.2)
farver 2.0.3 2020-01-16 [1] CRAN (R 3.6.2)
fs 1.3.1 2019-05-06 [1] CRAN (R 3.6.1)
gganimate * 1.0.5 2020-02-09 [1] CRAN (R 3.6.2)
ggplot2 * 3.3.0 2020-03-05 [1] CRAN (R 3.6.3)
gifski 0.8.6 2018-09-28 [1] CRAN (R 3.6.2)
glue 1.3.1 2019-03-12 [1] CRAN (R 3.6.1)
gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.1)
hms 0.5.3 2020-01-08 [1] CRAN (R 3.6.2)
htmltools 0.4.0 2019-10-04 [1] CRAN (R 3.6.1)
KernSmooth 2.23-16 2019-10-15 [2] CRAN (R 3.6.2)
knitr * 1.28 2020-02-06 [1] CRAN (R 3.6.2)
labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.3)
magrittr * 1.5 2014-11-22 [1] CRAN (R 3.6.1)
mapdata * 2.3.0 2018-03-30 [1] CRAN (R 3.6.2)
maps * 3.3.0 2018-04-03 [1] CRAN (R 3.6.2)
memoise 1.1.0 2017-04-21 [1] CRAN (R 3.6.2)
munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.1)
pillar 1.4.3 2019-12-20 [1] CRAN (R 3.6.2)
pkgbuild 1.0.6 2019-10-09 [1] CRAN (R 3.6.2)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.1)
pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.6.2)
plyr 1.8.6 2020-03-03 [1] CRAN (R 3.6.3)
prettyunits 1.1.1 2020-01-24 [1] CRAN (R 3.6.2)
processx 3.4.1 2019-07-18 [1] CRAN (R 3.6.2)
progress 1.2.2 2019-05-16 [1] CRAN (R 3.6.1)
ps 1.3.0 2018-12-21 [1] CRAN (R 3.6.1)
purrr 0.3.2 2019-03-15 [1] CRAN (R 3.6.1)
R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.2)
RColorBrewer 1.1-2 2014-12-07 [1] CRAN (R 3.6.0)
Rcpp 1.0.2 2019-07-25 [1] CRAN (R 3.6.1)
readr 1.3.1 2018-12-21 [1] CRAN (R 3.6.1)
readxl 1.3.1 2019-03-13 [1] CRAN (R 3.6.1)
remotes 2.1.0 2019-06-24 [1] CRAN (R 3.6.2)
rlang 0.4.6 2020-05-02 [1] CRAN (R 3.6.2)
rmarkdown 2.1 2020-01-20 [1] CRAN (R 3.6.2)
rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.6.2)
scales 1.1.0 2019-11-18 [1] CRAN (R 3.6.2)
sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.2)
sf * 0.9-3 2020-05-04 [1] CRAN (R 3.6.3)
stringi 1.4.4 2020-01-09 [1] CRAN (R 3.6.2)
stringr 1.4.0 2019-02-10 [1] CRAN (R 3.6.1)
testthat 2.3.1 2019-12-01 [1] CRAN (R 3.6.2)
tibble 3.0.1 2020-04-20 [1] CRAN (R 3.6.3)
tidyr * 1.0.2 2020-01-24 [1] CRAN (R 3.6.2)
tidyselect 0.2.5 2018-10-11 [1] CRAN (R 3.6.1)
tweenr 1.0.1 2018-12-14 [1] CRAN (R 3.6.2)
units 0.6-6 2020-03-16 [1] CRAN (R 3.6.3)
usethis 1.5.1 2019-07-04 [1] CRAN (R 3.6.2)
vctrs 0.2.4 2020-03-10 [1] CRAN (R 3.6.3)
viridisLite 0.3.0 2018-02-01 [1] CRAN (R 3.6.1)
withr 2.1.2 2018-03-15 [1] CRAN (R 3.6.1)
xfun 0.12 2020-01-13 [1] CRAN (R 3.6.2)
yaml 2.2.0 2018-07-25 [1] CRAN (R 3.6.0)
[1] C:/Users/belangew/Documents/R/win-library/3.6
[2] C:/Program Files/R/R-3.6.2/library